# Prepare NBER MORG data
# Depends on cps_functions.R
# all directories are relative to the root of the project

# setting options
options(mc.cores=2)

library(data.table)
library(dplyr)
## library(Hmisc)
library(ggplot2)
## library(ggsci)
library(haven)
library(EnvStats)

# read-in functions
source("./src/r/cps_functions.R")

# directory settings
data.dir <- "../data/raw/morg/"

## Occupation classification converters
occ1970.occ1990dd <- read_dta("./data/raw/daviddorn/occ1970_occ1990dd.dta")
occ1980.occ1990dd <- read_dta("./data/raw/daviddorn/occ1980_occ1990dd.dta")
occ1990.occ1990dd <- read_dta("./data/raw/daviddorn/occ1990_occ1990dd.dta")
occ2000.occ1990dd <- read_dta("./data/raw/daviddorn/occ2000_occ1990dd.dta")

occ2012.occ1990 <- fread("./data/generated/crosswalks/occ2012_occ1990.csv")
occ2012.occ1990$occ <- as.numeric(occ2012.occ1990$occ)
occ2012.occ1990$occ1990 <- as.numeric(occ2012.occ1990$occ1990)

occ00.occ1990 <- fread("./data/generated/crosswalks/occ00_occ1990.csv")
occ00.occ1990$occ <- as.numeric(occ00.occ1990$occ)
occ00.occ1990$occ1990 <- as.numeric(occ00.occ1990$occ1990)

occ1990dd.2digit <- fread("./data/raw/AcemogluAutor2011/census-prep-files/occ1990dd-recode.csv")
occ1990dd.2digit$occ1990dd <- as.numeric(occ1990dd.2digit$occ1990dd)


# Consumper price index obtained from https://fred.stlouisfed.org/series/DPCERG3A086NBEA
cpi <- fread("./data/raw/fred/DPCERG3A086NBEA.csv")
cpi <- cpi %>% rename(date=DATE,cpi=DPCERG3A086NBEA)
cpi$year <- as.numeric(substring(cpi$date,1,4))
base16 <- cpi$cpi[cpi$year==2016]
cpi <- cpi %>% mutate(cpi16=base16/cpi)

# Minimumwage from 82 transformed to 2016-dollars
minwage82 = 3.35
minwage16 = minwage82 * cpi$cpi16[cpi$year==1982]

varlist <- c("year","earnwt","hrwage16","hrwage16_pareto","hrwage16_original","cat3","earnwke16","uhourse","intmonth","gradeat","gradecp","ihigrdc","grade92","Recode","Recode_desc", "occ1990dd", 'age', 'race', 'sex', 'lfsr89', 'earnwke')
years <- c(1980:2015)
drop.low <- 0
drop.high <- 0
months <- c(1:12)

morg.all <- read.transform.data(years,varlist,drop.low,drop.high,months)

# stack data
data.stacked <- morg.all[[years[[1]]]]
if (length(years)>1){
  for (year in years[2:length(years)]){
    print(year)
    print(ncol(morg.all[[year]]))
      data.stacked <- bind_rows(data.stacked,morg.all[[year]])
  }
}

# generate indicators for the three occupations with ind1 = svc, ind2 = rt, ind3 = abs
data.stacked <- data.stacked %>% mutate(ind1 = ifelse(cat3 == 'svc',1,0)) %>%
  mutate(ind2 = ifelse(cat3 == 'rt',1,0)) %>%
    mutate(ind3 = ifelse(cat3 == 'abs',1,0))

# compute log wages
data.stacked <- data.stacked %>% mutate(ln_w = log(hrwage16), ln_w_pareto = log(hrwage16_pareto))

# drop police, firefighters and other law enforcement), create indicator for wage sample
data.stacked.dropped <- data.stacked %>% filter(!occ1990dd %in% c(417,418,423)) %>%
    mutate(wagesample = ifelse(uhourse>=35 & hrwage16>=0.5*minwage16 & !is.na(earnwke) & !is.na(cat3),1,0))

# save for computing summary statistics
fwrite(data.stacked.dropped,"./data/generated/r/morg_data_stacked_all_years.csv")


sumstats <- data.stacked.dropped %>%
  filter(!is.na(hrwage16), hrwage16 < Inf, !is.na(cat3)) %>%
  group_by(year, cat3) %>%
  summarize(avg_wage = mean(hrwage16, na.rm = TRUE)) %>%
  ungroup()

ggplot(sumstats, aes(x=year, y=avg_wage)) + geom_line(aes(color=cat3))

sumstats.relative <- sumstats %>% pivot_wider(names_from=cat3, values_from=avg_wage) %>%
  mutate(rt_abs = rt/abs, rt_man = rt/svc) %>%
  select(year, rt_abs, rt_man) %>%
  pivot_longer(starts_with('rt_'))

start_year <- 1980

norm_rt_abs = sumstats.relative$value[sumstats.relative$year==start_year & sumstats.relative$name=='rt_abs']
norm_rt_man = sumstats.relative$value[sumstats.relative$year==start_year & sumstats.relative$name=='rt_man']

sumstats.relative.norm <- sumstats.relative
sumstats.relative.norm$value[sumstats.relative.norm$name=='rt_abs'] <- sumstats.relative.norm$value[sumstats.relative.norm$name=='rt_abs']/norm_rt_abs
sumstats.relative.norm$value[sumstats.relative.norm$name=='rt_man'] <- sumstats.relative.norm$value[sumstats.relative.norm$name=='rt_man']/norm_rt_man

ggplot(sumstats.relative.norm %>% filter(year>=start_year), aes(x=year, y=value)) + geom_line(aes(linetype=name)) +
  scale_linetype_manual(name='', values=c(1, 2), labels=c('Routine/Abstract', 'Routine/Manual')) +
  xlab('Year') +
  ylab('Ratio of Occupational Wages (1980=1)') +
  theme_bw() +
  theme(legend.position='top')

## data directory
dir.project <- './'

## figures directory for output
dir.fig <- paste0(dir.project, 'doc/referee reports/')

ggsave(
  paste0(dir.fig, "relative_wages_over_time.pdf"),
  width = 10,
  height = 8,
  units = "cm"
)
